Skipping install of 'dsbox' from a github remote, the SHA1 (244ecdfe) has not changed since last install.
Use `force = TRUE` to force installation
# Console output: wrap printed results at 65 characters
options(width = 65)

# ggplot2: minimal theme with a 14 pt base font for every plot
ggplot2::theme_set(ggplot2::theme_minimal(base_size = 14))

# knitr: figure defaults shared by all chunks
knitr::opts_chunk$set(
  dpi = 300,            # higher dpi, sharper image
  fig.align = "center", # center align figures
  fig.retina = 3,       # dpi multiplier for displaying HTML output on retina
  fig.asp = 0.618,      # the golden ratio
  fig.width = 7         # 7" width
)

### All responses are in comments within the code
2 1 - Road traffic accidents in Edinburgh
# Load the accidents data from the CSV file in data/
accidents <- read_csv(file = "data/accidents.csv")
Rows: 768 Columns: 31
── Column specification ─────────────────────────────────────────
Delimiter: ","
chr (18): id, severity, date, day_of_week, highway, first_ro...
dbl (12): easting, northing, longitude, latitude, police_for...
time (1): time
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Tag each accident as "Weekend" or "Weekday".
# wday() codes used here: Sunday is 1 and Saturday is 7.
accidents <- mutate(
  accidents,
  day_type = if_else(
    wday(date) == 1 | wday(date) == 7,
    "Weekend",
    "Weekday"
  )
)

# Fill colour per severity level
severity_fill <- c(
  "Fatal" = "gray",
  "Serious" = "deepskyblue3",
  "Slight" = "yellow3"
)

# x axis breaks every four hours, from midnight to midnight
four_hour_breaks <- hms::as_hms(
  c(
    "00:00:00", "04:00:00", "08:00:00", "12:00:00",
    "16:00:00", "20:00:00", "24:00:00"
  )
)

# Density of accident times by severity, one panel per day type (stacked)
ggplot(data = accidents, mapping = aes(x = time, fill = severity)) +
  geom_density(alpha = 0.7) +
  facet_wrap(~day_type, ncol = 1) +
  scale_fill_manual(values = severity_fill) +
  # Closest color I can get by
  scale_color_colorblind() +
  scale_x_time(breaks = four_hour_breaks) +
  labs(
    title = "Number of accidents throughout the day",
    subtitle = "By day of week and severity",
    x = "Time of day",
    y = "Density",
    fill = "Severity"
  ) +
  theme(legend.position = "right")
Interpretation:
The plot shows the distribution of accidents over the time of day for each of the three severity levels, drawn in two panels: one for weekdays and one for weekends.
3 2 - NYC marathon winners
3.1 (a)
# Load the NYC marathon winners data from the CSV file in data/
marathon <- read_csv(file = "data/nyc_marathon.csv")
Rows: 102 Columns: 7
── Column specification ─────────────────────────────────────────
Delimiter: ","
chr (4): name, country, division, note
dbl (2): year, time_hrs
time (1): time
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# (a)
# Convert the finish time to seconds and drop rows without a usable time.
marathon_a <- marathon %>%
  mutate(time_seconds = period_to_seconds(hms(time))) %>%
  filter(is.finite(time_seconds))

# Histogram of finish times; the x values are minutes (seconds / 60), so the
# axis label states the unit, matching the boxplot of the same variable.
ggplot(marathon_a, aes(x = time_seconds / 60)) +
  geom_histogram(binwidth = 10, fill = "green4", color = "black") +
  labs(
    title = "Distribution of Finish Times",
    x = "Finish Time (minutes)",
    y = "Number of Runners"
  )
# Boxplot of finish times in minutes (seconds / 60)
ggplot(data = marathon_a, mapping = aes(x = time_seconds / 60)) +
  geom_boxplot(
    colour = "green4",
    fill = "lightgreen",
    outlier.colour = "black"
  ) +
  labs(
    x = "Finish Time (minutes)",
    title = "Boxplot of Finish Times"
  )
### The boxplot shows the median and the outliers clearly, while the histogram shows the shape of the distribution in more detail (as bars).
3.2 (b)
# (b)
# Side-by-side boxplots of finish time per division. The y values are minutes
# (seconds / 60), so the axis label states the unit.
ggplot(marathon_a, aes(x = division, y = time_seconds / 60, fill = division)) +
  geom_boxplot(alpha = 0.6, color = "darkgray") +
  labs(
    title = "Finish Times by Gender",
    x = "Division",
    y = "Finish Time (minutes)"
  ) +
  scale_fill_manual(values = c("Men" = "green4", "Women" = "orange"))
### Men have a faster (lower) median finish time than women, and the women's finish times have more widely scattered outliers than the men's.
3.3 (c)
# (c)
### The legend is redundant because the x axis already names each division;
### removing it reduces clutter and makes the plot clearer.
# Same boxplot as in (b), with the fill legend hidden. The y values are
# minutes (seconds / 60), so the axis label states the unit.
ggplot(marathon_a, aes(x = division, y = time_seconds / 60, fill = division)) +
  geom_boxplot(alpha = 0.6, color = "darkgray") +
  labs(
    title = "Finish Times by Gender",
    x = "Division",
    y = "Finish Time (minutes)"
  ) +
  scale_fill_manual(values = c("Men" = "green4", "Women" = "orange")) +
  theme(legend.position = "none")
3.4 (d)
# (d)
# Winning time (minutes) per year, one line per division.
ggplot(marathon_a, aes(x = year, y = time_seconds / 60, color = division)) +
  # `linewidth` replaces the `size` argument for lines (deprecated since
  # ggplot2 3.4.0)
  geom_line(linewidth = 1) +
  geom_point(alpha = 0.7) +
  scale_color_manual(values = c("Men" = "black", "Women" = "blue")) +
  # The `+` above is required: without it labs() is a separate statement, so
  # the labels are printed to the console instead of being added to the plot.
  labs(
    title = "Winning Times Over the Years",
    x = "Year",
    y = "Finish Time (minutes)",
    color = "Division"
  )
### Unlike the other plots, this one lets us see trends in the winning times over the years.
4 3 - US counties
4.1 (a)
### (a) The two layers use completely different x variables, so the boxplot
### did not even render. I do not think this snippet uses the correct
### parameters.
ggplot(data = county) +
  geom_point(mapping = aes(x = median_edu, y = median_hh_income)) +
  geom_boxplot(mapping = aes(x = smoking_ban, y = pop2017))
Warning: Removed 3 rows containing non-finite outside the scale range
(`stat_boxplot()`).
Warning: Removed 2 rows containing missing values or values outside the
scale range (`geom_point()`).
4.2 (b)
### (b) The plot faceted into columns looks visually clearer and is easier to
### comprehend. Use the row-based facet grid when you want to avoid horizontal
### scrolling, and the column-based one when there is a moderate number of
### levels, for better clarity.
# Poverty against homeownership, one row of panels per education level;
# counties with a missing median_edu are dropped first.
counties_with_edu <- filter(county, !is.na(median_edu))

ggplot(data = counties_with_edu) +
  geom_point(mapping = aes(x = homeownership, y = poverty)) +
  facet_grid(rows = vars(median_edu))
Warning: Removed 2 rows containing missing values or values outside the
scale range (`geom_point()`).
## Plot B: scatter plot with a single blue loess curve
ggplot(data = county, mapping = aes(x = homeownership, y = poverty)) +
  geom_point() +
  geom_smooth(colour = "blue", se = FALSE, method = "loess") +
  labs(title = "Plot B")
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_smooth()`).
Removed 2 rows containing missing values or values outside the
scale range (`geom_point()`).
## Plot C: scatter plot with a gam curve and a glm line, both green.
## `linewidth` replaces `size` for line thickness (the `size` aesthetic for
## lines is deprecated since ggplot2 3.4.0).
ggplot(county, aes(x = homeownership, y = poverty)) +
  geom_point() +
  geom_smooth(method = "gam", se = FALSE, color = "green", linewidth = 1.2) +
  geom_smooth(method = "glm", se = FALSE, color = "green", linewidth = 1.2) +
  ggtitle("Plot C")
`geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_smooth()`).
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_smooth()`).
Removed 2 rows containing missing values or values outside the
scale range (`geom_point()`).
## Plot D: all counties as points; the lm line and the gam curve are fitted
## only to counties with homeownership strictly between 25 and 85.
mid_range_counties <- subset(county, homeownership > 25 & homeownership < 85)

ggplot(data = county, mapping = aes(x = homeownership, y = poverty)) +
  geom_point() +
  geom_smooth(
    data = mid_range_counties,
    method = "lm",
    colour = "blue",
    se = FALSE
  ) +
  geom_smooth(
    data = mid_range_counties,
    method = "gam",
    colour = "blue",
    se = FALSE
  ) +
  labs(title = "Plot D")
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_smooth()`).
`geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_smooth()`).
Removed 2 rows containing missing values or values outside the
scale range (`geom_point()`).
## Plot E: points and lm lines coloured by metro; line type also varies by
## metro
ggplot(
  data = county,
  mapping = aes(x = homeownership, y = poverty, colour = metro)
) +
  geom_point() +
  geom_smooth(mapping = aes(linetype = metro), se = FALSE, method = "lm") +
  labs(title = "Plot E")
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_smooth()`).
Removed 2 rows containing missing values or values outside the
scale range (`geom_point()`).
## Plot F: points and gam curves, both coloured by metro
ggplot(
  data = county,
  mapping = aes(x = homeownership, y = poverty, colour = metro)
) +
  geom_point() +
  geom_smooth(se = FALSE, method = "gam") +
  labs(title = "Plot F")
`geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_smooth()`).
Removed 2 rows containing missing values or values outside the
scale range (`geom_point()`).
## Plot G: points coloured by metro, with one blue loess curve over all
## counties
ggplot(data = county, mapping = aes(x = homeownership, y = poverty)) +
  geom_point(mapping = aes(colour = metro)) +
  geom_smooth(colour = "blue", se = FALSE, method = "loess") +
  labs(title = "Plot G")
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_smooth()`).
Removed 2 rows containing missing values or values outside the
scale range (`geom_point()`).
## Plot H: points coloured by metro, no smoother
ggplot(
  data = county,
  mapping = aes(x = homeownership, y = poverty, colour = metro)
) +
  geom_point() +
  labs(title = "Plot H")
Warning: Removed 2 rows containing missing values or values outside the
scale range (`geom_point()`).
5 4 - Credit Card Balances
# Load the credit card data from the CSV file in data/
creditcard <- read_csv(file = "data/credit.csv")
Rows: 400 Columns: 5
── Column specification ─────────────────────────────────────────
Delimiter: ","
chr (2): student, married
dbl (3): balance, income, limit
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
### As income increases, the credit card balance also increases. Married people tend to have higher balances.
### (b)
### Married students tend to have considerably lower balances while earning relatively higher salaries. These variables are useful for predicting balance trends and help highlight the clear differences between the groups.
### (d)
### The relationship between income and credit utilization is completely different from the one in (b). Credit utilization is higher for married people than for the other group. This difference is useful for interpreting the credit behavior of different groups of people.
6 5 - Napoleon’s march.
### Reference from -- https://www.andrewheiss.com/blog/2017/08/10/exploring-minards-1812-plot-with-ggplot2/
### Detailed explanation below for this plot's snippets

### Load data into the napoleon variable from the RDS file (RDS is an R object
### file), which contains 3 components - troop movement, cities & temperature
### data
napoleon <- read_rds("data/napoleon.rds")

### Extract the respective data components into their own variables
cities <- napoleon[[1]]
temp <- napoleon[[2]]
troops <- napoleon[[3]]

### Create a new variable with a label that combines temperature & date
temps.nice <- temp %>%
  mutate(nice.label = paste0(temp, "°, ", month, ". ", day))

### Instantiate the plot with a path geom that draws the troop movement,
### coloured by direction of the movement & with width given by the number of
### survivors. The width is mapped to `linewidth`, because the `size`
### aesthetic for lines is deprecated since ggplot2 3.4.0.
march.1812.plot.simple <- ggplot() +
  geom_path(
    data = troops,
    aes(
      x = long, y = lat, group = group,
      color = direction, linewidth = survivors
    ),
    lineend = "round"
  ) +
  ### Add cities as orange points
  geom_point(data = cities, aes(x = long, y = lat), color = "orange") +
  ### Add labels for the cities
  geom_text_repel(
    data = cities,
    aes(x = long, y = lat, label = city),
    color = "orange"
  ) +
  ### Adjust the thickness range of the path. NOTE: scale_linewidth() maps
  ### values to widths linearly, whereas the former scale_size() used an
  ### area-based mapping, so relative widths differ slightly from before.
  scale_linewidth(range = c(0.5, 10)) +
  ### Manually set the colors for the directions
  scale_colour_manual(values = c("blue", "black")) +
  ### Hide the legends (for ink to data ratio); `FALSE` is no longer accepted
  ### by guides() as of ggplot2 3.3.4, "none" is the replacement
  guides(color = "none", linewidth = "none") +
  ### Customized labels
  labs(
    title = "Napoleon's Troop Campaign",
    subtitle = "Path and size of the army during the campaign",
    x = "Longitude",
    y = "Latitude",
    caption = "Data source: https://www.andrewheiss.com/blog/2017/08/10/exploring-minards-1812-plot-with-ggplot2/"
  )

### New plot showing temperature against longitude
temps.1812.plot <- ggplot(data = temps.nice, aes(x = long, y = temp)) +
  geom_line() +
  ### Formatted labels
  geom_label(aes(label = nice.label), size = 2.5) +
  ### Add label to the y axis
  labs(x = NULL, y = "° Celsius") +
  ### Use the x range of the troop movement plot so both panels line up.
  ### The built plot stores it in `panel_params`; the older `panel_ranges`
  ### element is NULL in current ggplot2, which silently left the limits
  ### unset.
  scale_x_continuous(
    limits = ggplot_build(march.1812.plot.simple)$layout$panel_params[[1]]$x.range
  ) +
  ### Move the axis to the right
  scale_y_continuous(position = "right") +
  coord_cartesian(ylim = c(-35, 5)) +
  ### Remove the grid components, borders & text at the x axis
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks = element_blank(),
    panel.border = element_blank()
  )

### Combine the 2 plots by stacking their grobs
both.1812.plot.simple <- rbind(
  ggplotGrob(march.1812.plot.simple),
  ggplotGrob(temps.1812.plot)
)

### Find the layout rows that hold the two panels
panels <- both.1812.plot.simple$layout$t[
  grep("panel", both.1812.plot.simple$layout$name)
]

### Adjust the height ratio of the two panels (3 : 1)
both.1812.plot.simple$heights[panels] <- grid::unit(c(3, 1), "null")

### Final combined plot
grid::grid.newpage()
grid::grid.draw(both.1812.plot.simple)
Source Code
---
title: "HW 01"
author: "Nandakumar Kuthalaraja"
format:
  html:
    embed-resources: true
    code-fold: false
    code-tools: true
    toc: true
    number-sections: true
highlight-style: a11y
execute:
  code-link: true
editor_options:
  chunk_output_type: console
---

## 0 - Setup

```{r setup}
# Install pacman only when it is missing. requireNamespace() checks for the
# package without attaching it; pacman is always called as pacman:: below.
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")

pacman::p_load(tidyverse, glue, scales, ggthemes, openintro, ggrepel)

devtools::install_github("tidyverse/dsbox")

# set theme for ggplot2
ggplot2::theme_set(ggplot2::theme_minimal(base_size = 14))

# set width of code output
options(width = 65)

# set figure parameters for knitr
knitr::opts_chunk$set(
  fig.width = 7,        # 7" width
  fig.asp = 0.618,      # the golden ratio
  fig.retina = 3,       # dpi multiplier for displaying HTML output on retina
  fig.align = "center", # center align figures
  dpi = 300             # higher dpi, sharper image
)

### All responses are in comments within the code
```

## 1 - Road traffic accidents in Edinburgh

```{r label-me-1}
# Reading the Dataset
accidents <- read_csv("data/accidents.csv")

# Tag each accident as "Weekend" or "Weekday"
accidents <- accidents %>%
  mutate(
    day_type = if_else(
      wday(date) == 1 | wday(date) == 7, # Sunday (1) or Saturday (7)
      "Weekend",
      "Weekday"
    )
  )

# Density of accident times by severity, one panel per day type (stacked)
ggplot(accidents, aes(x = time, fill = severity)) +
  geom_density(alpha = 0.7) +
  facet_wrap(~day_type, ncol = 1) +
  scale_fill_manual(
    values = c(
      "Fatal" = "gray",
      "Serious" = "deepskyblue3",
      "Slight" = "yellow3"
    )
  ) +
  # Closest color I can get by
  scale_color_colorblind() +
  scale_x_time(
    breaks = hms::as_hms(
      c(
        "00:00:00", "04:00:00", "08:00:00", "12:00:00",
        "16:00:00", "20:00:00", "24:00:00"
      )
    )
  ) +
  labs(
    title = "Number of accidents throughout the day",
    subtitle = "By day of week and severity",
    x = "Time of day",
    y = "Density",
    fill = "Severity"
  ) +
  theme(legend.position = "right")
```

::: hand
**Interpretation**:

The plot shows the distribution of accidents over the time of day for each of the three severity levels, drawn in two panels: one for weekdays and one for weekends.
:::

## 2 - NYC marathon winners

### (a)

```{r label-me-6}
# Reading the Dataset
marathon <- read_csv("data/nyc_marathon.csv")
```

```{r label-me-3}
# (a)
# Convert the finish time to seconds and drop rows without a usable time
marathon_a <- marathon %>%
  mutate(time_seconds = period_to_seconds(hms(time))) %>%
  filter(is.finite(time_seconds))

# Plot histogram (x values are minutes: seconds / 60)
ggplot(marathon_a, aes(x = time_seconds / 60)) +
  geom_histogram(binwidth = 10, fill = "green4", color = "black") +
  labs(
    title = "Distribution of Finish Times",
    x = "Finish Time (minutes)",
    y = "Number of Runners"
  )
```

```{r label-me-3a}
# Plot Boxplot
ggplot(marathon_a, aes(x = time_seconds / 60)) +
  geom_boxplot(fill = "lightgreen", color = "green4", outlier.color = "black") +
  labs(
    title = "Boxplot of Finish Times",
    x = "Finish Time (minutes)"
  )

### The boxplot shows the median and the outliers clearly, while the
### histogram shows the shape of the distribution in more detail (as bars).
```

### (b)

```{r label-me-4}
# (b)
# Plot Boxplot per division (y values are minutes: seconds / 60)
ggplot(marathon_a, aes(x = division, y = time_seconds / 60, fill = division)) +
  geom_boxplot(alpha = 0.6, color = "darkgray") +
  labs(
    title = "Finish Times by Gender",
    x = "Division",
    y = "Finish Time (minutes)"
  ) +
  scale_fill_manual(values = c("Men" = "green4", "Women" = "orange"))

### Men have a faster (lower) median finish time than women, and the women's
### finish times have more widely scattered outliers than the men's.
```

### (c)

```{r label-me-5}
# (c)
### The legend is redundant because the x axis already names each division;
### removing it reduces clutter and makes the plot clearer.

# Plot Boxplot per division with the fill legend hidden
ggplot(marathon_a, aes(x = division, y = time_seconds / 60, fill = division)) +
  geom_boxplot(alpha = 0.6, color = "darkgray") +
  labs(
    title = "Finish Times by Gender",
    x = "Division",
    y = "Finish Time (minutes)"
  ) +
  scale_fill_manual(values = c("Men" = "green4", "Women" = "orange")) +
  theme(legend.position = "none")
```

### (d)

```{r label-me-2}
# (d)
# Winning time (minutes) per year, one line per division
ggplot(marathon_a, aes(x = year, y = time_seconds / 60, color = division)) +
  # `linewidth` replaces the `size` argument for lines (deprecated since
  # ggplot2 3.4.0)
  geom_line(linewidth = 1) +
  geom_point(alpha = 0.7) +
  scale_color_manual(values = c("Men" = "black", "Women" = "blue")) +
  # The `+` above is required: without it labs() is a separate statement, so
  # the labels are printed to the console instead of being added to the plot.
  labs(
    title = "Winning Times Over the Years",
    x = "Year",
    y = "Finish Time (minutes)",
    color = "Division"
  )

### Unlike the other plots, this one lets us see trends in the winning times
### over the years.
```

## 3 - US counties

### (a)

```{r}
### (a) The two layers use completely different x variables, so the boxplot
### did not even render. I do not think this snippet uses the correct
### parameters.
ggplot(county) +
  geom_point(aes(x = median_edu, y = median_hh_income)) +
  geom_boxplot(aes(x = smoking_ban, y = pop2017))
```

### (b)

```{r}
### (b) The plot faceted into columns looks visually clearer and is easier to
### comprehend. Use the row-based facet grid when you want to avoid horizontal
### scrolling, and the column-based one when there is a moderate number of
### levels, for better clarity.

# Facets arranged in rows
ggplot(county %>% filter(!is.na(median_edu))) +
  geom_point(aes(x = homeownership, y = poverty)) +
  facet_grid(median_edu ~ .)

# Facets arranged in columns
ggplot(county %>% filter(!is.na(median_edu))) +
  geom_point(aes(x = homeownership, y = poverty)) +
  facet_grid(. ~ median_edu)
```

### (c)

```{r}
### (c)

## Plot A
ggplot(county, aes(x = homeownership, y = poverty)) +
  geom_point() +
  ggtitle("Plot A")

## Plot B
ggplot(county, aes(x = homeownership, y = poverty)) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE, color = "blue") +
  ggtitle("Plot B")

## Plot C (`linewidth` replaces the deprecated `size` for line thickness)
ggplot(county, aes(x = homeownership, y = poverty)) +
  geom_point() +
  geom_smooth(method = "gam", se = FALSE, color = "green", linewidth = 1.2) +
  geom_smooth(method = "glm", se = FALSE, color = "green", linewidth = 1.2) +
  ggtitle("Plot C")

## Plot D (smoothers fitted only where homeownership is between 25 and 85)
ggplot(county, aes(x = homeownership, y = poverty)) +
  geom_point() +
  geom_smooth(
    data = subset(county, homeownership > 25 & homeownership < 85),
    se = FALSE, method = "lm", color = "blue"
  ) +
  geom_smooth(
    data = subset(county, homeownership > 25 & homeownership < 85),
    se = FALSE, method = "gam", color = "blue"
  ) +
  ggtitle("Plot D")

## Plot E
ggplot(county, aes(x = homeownership, y = poverty, color = metro)) +
  geom_point() +
  geom_smooth(method = "lm", aes(linetype = metro), se = FALSE) +
  ggtitle("Plot E")

## Plot F
ggplot(county, aes(x = homeownership, y = poverty, color = metro)) +
  geom_point() +
  geom_smooth(method = "gam", se = FALSE) +
  ggtitle("Plot F")

## Plot G
ggplot(county, aes(x = homeownership, y = poverty)) +
  geom_point(aes(color = metro)) +
  geom_smooth(method = "loess", se = FALSE, color = "blue") +
  ggtitle("Plot G")

## Plot H
ggplot(county, aes(x = homeownership, y = poverty, color = metro)) +
  geom_point() +
  ggtitle("Plot H")
```

## 4 - Credit Card Balances

```{r}
creditcard <- read_csv("data/credit.csv")
```

```{r}
### (a)
ggplot(creditcard, aes(x = income, y = balance)) +
  geom_point(aes(color = student, shape = student), alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE, aes(color = student)) +
  scale_color_manual(values = c("No" = "orange", "Yes" = "pink3")) +
  facet_grid(
    student ~ married,
    labeller = labeller(married = label_both, student = label_both)
  ) +
  scale_x_continuous(labels = label_dollar(prefix = "$", suffix = "k")) +
  scale_y_continuous(labels = label_dollar(prefix = "$")) +
  labs(x = "Income", y = "Credit card balance") +
  theme(
    strip.background = element_rect(fill = "gray90", color = "black"),
    strip.text = element_text(face = "bold"),
    legend.position = "none",
    panel.border = element_rect(color = "black", fill = NA)
  )

### As income increases, the credit card balance also increases. Married
### people tend to have higher balances.
```

```{r}
### (b)
### Married students tend to have considerably lower balances while earning
### relatively higher salaries. These variables are useful for predicting
### balance trends and help highlight the clear differences between the
### groups.
```

```{r}
### (c)
# Credit utilization: balance as a share of the credit limit
creditcard <- creditcard %>%
  mutate(utz = balance / limit)

ggplot(creditcard, aes(x = income, y = utz)) +
  geom_point(aes(color = student, shape = student), alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE, aes(color = student)) +
  scale_color_manual(values = c("No" = "orange", "Yes" = "pink3")) +
  # label_both yields the same "student: No" / "married: Yes" strip text as
  # the plot in (a)
  facet_grid(
    student ~ married,
    labeller = labeller(married = label_both, student = label_both)
  ) +
  scale_y_continuous(labels = percent_format(accuracy = 1)) +
  scale_x_continuous(labels = label_dollar(prefix = "$", suffix = "k")) +
  labs(
    x = "Income",
    y = "Credit utilization"
  ) +
  theme(
    strip.background = element_rect(fill = "gray90", color = "black"),
    strip.text = element_text(face = "bold"),
    legend.position = "none",
    panel.border = element_rect(color = "black", fill = NA)
  )
```

```{r}
### (d)
### The relationship between income and credit utilization is completely
### different from the one in (b). Credit utilization is higher for married
### people than for the other group. This difference is useful for
### interpreting the credit behavior of different groups of people.
```

## 5 - Napoleon’s march.

```{r}
### Reference from -- https://www.andrewheiss.com/blog/2017/08/10/exploring-minards-1812-plot-with-ggplot2/
### Detailed explanation below for this plot's snippets

### Load data into the napoleon variable from the RDS file (RDS is an R object
### file), which contains 3 components - troop movement, cities & temperature
### data
napoleon <- read_rds("data/napoleon.rds")

### Extract the respective data components into their own variables
cities <- napoleon[[1]]
temp <- napoleon[[2]]
troops <- napoleon[[3]]

### Create a new variable with a label that combines temperature & date
temps.nice <- temp %>%
  mutate(nice.label = paste0(temp, "°, ", month, ". ", day))

### Instantiate the plot with a path geom that draws the troop movement,
### coloured by direction of the movement & with width given by the number of
### survivors. The width is mapped to `linewidth`, because the `size`
### aesthetic for lines is deprecated since ggplot2 3.4.0.
march.1812.plot.simple <- ggplot() +
  geom_path(
    data = troops,
    aes(
      x = long, y = lat, group = group,
      color = direction, linewidth = survivors
    ),
    lineend = "round"
  ) +
  ### Add cities as orange points
  geom_point(data = cities, aes(x = long, y = lat), color = "orange") +
  ### Add labels for the cities
  geom_text_repel(
    data = cities,
    aes(x = long, y = lat, label = city),
    color = "orange"
  ) +
  ### Adjust the thickness range of the path. NOTE: scale_linewidth() maps
  ### values to widths linearly, whereas the former scale_size() used an
  ### area-based mapping, so relative widths differ slightly from before.
  scale_linewidth(range = c(0.5, 10)) +
  ### Manually set the colors for the directions
  scale_colour_manual(values = c("blue", "black")) +
  ### Hide the legends (for ink to data ratio); `FALSE` is no longer accepted
  ### by guides() as of ggplot2 3.3.4, "none" is the replacement
  guides(color = "none", linewidth = "none") +
  ### Customized labels
  labs(
    title = "Napoleon's Troop Campaign",
    subtitle = "Path and size of the army during the campaign",
    x = "Longitude",
    y = "Latitude",
    caption = "Data source: https://www.andrewheiss.com/blog/2017/08/10/exploring-minards-1812-plot-with-ggplot2/"
  )

### New plot showing temperature against longitude
temps.1812.plot <- ggplot(data = temps.nice, aes(x = long, y = temp)) +
  geom_line() +
  ### Formatted labels
  geom_label(aes(label = nice.label), size = 2.5) +
  ### Add label to the y axis
  labs(x = NULL, y = "° Celsius") +
  ### Use the x range of the troop movement plot so both panels line up.
  ### The built plot stores it in `panel_params`; the older `panel_ranges`
  ### element is NULL in current ggplot2, which silently left the limits
  ### unset.
  scale_x_continuous(
    limits = ggplot_build(march.1812.plot.simple)$layout$panel_params[[1]]$x.range
  ) +
  ### Move the axis to the right
  scale_y_continuous(position = "right") +
  coord_cartesian(ylim = c(-35, 5)) +
  ### Remove the grid components, borders & text at the x axis
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks = element_blank(),
    panel.border = element_blank()
  )

### Combine the 2 plots by stacking their grobs
both.1812.plot.simple <- rbind(
  ggplotGrob(march.1812.plot.simple),
  ggplotGrob(temps.1812.plot)
)

### Find the layout rows that hold the two panels
panels <- both.1812.plot.simple$layout$t[
  grep("panel", both.1812.plot.simple$layout$name)
]

### Adjust the height ratio of the two panels (3 : 1)
both.1812.plot.simple$heights[panels] <- grid::unit(c(3, 1), "null")

### Final combined plot
grid::grid.newpage()
grid::grid.draw(both.1812.plot.simple)
```